import numpy as np
import pandas as pd
import dask.dataframe as dd
from IPython.display import display
import plotly.express as px
import plotly.graph_objects as go
%%time
df = dd.read_parquet(r"C:\Users\prath\OneDrive\Desktop\microsoft_malware_prediction\Data Exploration\train.parquet")
df = df.compute()
CPU times: total: 23.4 s Wall time: 1min 20s
print(df.shape)
(8921483, 83)
%%time
stats = []
for col in df.columns:
stats.append((col, df[col].nunique(), df[col].isnull().sum() * 100 / df.shape[0], df[col].value_counts(normalize=True, dropna=False).values[0] * 100, df[col].dtype))
stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values', 'Percentage of values in the biggest category', 'type'])
stats_df.sort_values('Percentage of missing values', ascending=False)
CPU times: total: 12.5 s Wall time: 15.9 s
| Feature | Unique_values | Percentage of missing values | Percentage of values in the biggest category | type | |
|---|---|---|---|---|---|
| 28 | PuaMode | 2 | 99.974119 | 99.974119 | category |
| 41 | Census_ProcessorClass | 3 | 99.589407 | 99.589407 | category |
| 8 | DefaultBrowsersIdentifier | 2017 | 95.141637 | 95.141637 | float32 |
| 68 | Census_IsFlightingInternal | 2 | 83.044030 | 83.044030 | float32 |
| 52 | Census_InternalBatteryType | 78 | 71.046809 | 71.046809 | category |
| ... | ... | ... | ... | ... | ... |
| 1 | ProductName | 6 | 0.000000 | 98.935569 | category |
| 45 | Census_HasOpticalDiskDrive | 2 | 0.000000 | 92.281272 | int16 |
| 54 | Census_OSVersion | 469 | 0.000000 | 15.845202 | category |
| 55 | Census_OSArchitecture | 3 | 0.000000 | 90.858045 | category |
| 82 | HasDetections | 2 | 0.000000 | 50.020731 | int16 |
83 rows × 5 columns
train= pd.DataFrame(df)
good_cols = list(df.columns)
for col in df.columns:
rate = df[col].value_counts(normalize=True, dropna=False).values[0]
if rate > 0.9: # Removing Missing Values if
good_cols.remove(col)
train= df[good_cols]
def plot_category_percent_of_target(col,color):
cat_percent = train[[col, 'HasDetections']].groupby(col, as_index=False).mean()
cat_size = train[col].value_counts().reset_index(drop=False)
cat_size.columns = [col, 'count']
cat_percent = cat_percent.merge(cat_size, on=col, how='left')
cat_percent['HasDetections'] = cat_percent['HasDetections'].fillna(0)
cat_percent = cat_percent.sort_values(by='count', ascending=False)[:20]
fig = px.bar(cat_percent, x='HasDetections', y=col, orientation='h',
hover_data={'HasDetections': ':.2f', 'count': True},
color='HasDetections', color_continuous_scale=color)
fig.update_layout(title='Percentage of HasDetections vs {}'.format(col),
xaxis_title='% of HasDetections (target)', yaxis_title=col)
fig.show()
plot_category_percent_of_target("EngineVersion",color="teal")
plot_category_percent_of_target("AppVersion",color="Purples")
plot_category_percent_of_target("OsPlatformSubRelease",color="ice")
plot_category_percent_of_target("OsBuildLab",color="speed")
plot_category_percent_of_target("OsBuildLab",color="Magenta")
plot_category_percent_of_target("SkuEdition",color="balance")
plot_category_percent_of_target("SmartScreen",color="Portland")
plot_category_percent_of_target("Census_MDC2FormFactor",color="Redor")
plot_category_percent_of_target("Census_PrimaryDiskTypeName",color="Agsunset")
plot_category_percent_of_target("Census_ChassisTypeName",color="Deep")
plot_category_percent_of_target("Census_PowerPlatformRoleName",color="RdPu")
plot_category_percent_of_target("Census_OSVersion",color="Sunset")
plot_category_percent_of_target("Census_OSBranch",color="Oryel")
plot_category_percent_of_target("Census_OSEdition",color="Haline")
plot_category_percent_of_target("Census_OSSkuName",color="DarkMint")
plot_category_percent_of_target("Census_OSInstallTypeName",color="Reds")
plot_category_percent_of_target("Census_OSWUAutoUpdateOptionsName",color="Ice")
plot_category_percent_of_target("Census_ActivationChannel",color="Mint")